from IPython.display import HTML
import pandas as pd
import altair as alt
import geopandas as gpd
import pycountry
import numpy as np
import os
# Lift Altair's default row limit so the charts below can embed the full dataset.
alt.data_transformers.disable_max_rows()
DataTransformerRegistry.enable('default')
My dataset contains some of the major song attributes (acousticness, loudness, danceability, energy, etc.) that describe songs from the 1960s up to the 2010s, with an equal split between hit songs and non-hit ones, so I decided to build the analysis mostly on that data. In addition, I added information about release dates, genres, and the artist's country of origin.
# Load every CSV in `path` whose file name contains "dataset", tag its rows
# with a decade label, and keep only the complete rows of hit songs.
path = "data-parsed/"
files = [os.path.join(path, f) for f in os.listdir(path) if "dataset" in f]

# Maps the decade suffix of a file name to the numeric label stored in the
# "decade" column (e.g. "60s" -> 1970). Built once, outside the loop.
years = {"00s": 2010, "10s": 2020, "60s": 1970, "70s": 1980, "80s": 1990, "90s": 2000}

frames = []
for f in files:
    tmp = pd.read_csv(f)
    # Drop the first column (presumably a saved row index; verify against the CSV files).
    tmp = tmp.drop(tmp.columns[0], axis=1)
    # The decade suffix is the last three characters before the file extension.
    decade = f.split(".")[-2][-3:]
    tmp["decade"] = years[decade]
    frames.append(tmp)

# Concatenate once: growing the frame inside the loop re-copies all
# previously loaded rows on every iteration.
initial_dataset = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()
initial_dataset.dropna(inplace=True)
# Keep hit songs only (target == 1). The explicit copy makes the result an
# independent frame, so later column assignments do not raise
# SettingWithCopyWarning.
initial_dataset = initial_dataset[initial_dataset["target"] == 1].copy()
initial_dataset.columns
Index(['track', 'artist', 'uri', 'danceability', 'energy', 'key', 'loudness',
'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness',
'valence', 'tempo', 'duration_ms', 'time_signature', 'chorus_hit',
'sections', 'target', 'country', 'country_code', 'release_date',
'style', 'genre', 'decade'],
dtype='object')
# Summary statistics of the numeric columns of the filtered (hit-song) dataset.
initial_dataset.describe()
| danceability | energy | key | loudness | mode | speechiness | acousticness | instrumentalness | liveness | valence | tempo | duration_ms | time_signature | chorus_hit | sections | target | release_date | decade | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 9286.000000 | 9286.000000 | 9286.000000 | 9286.000000 | 9286.000000 | 9286.000000 | 9286.000000 | 9286.000000 | 9286.000000 | 9286.000000 | 9286.000000 | 9.286000e+03 | 9286.000000 | 9286.000000 | 9286.000000 | 9286.0 | 9286.000000 | 9286.000000 |
| mean | 0.590372 | 0.641361 | 5.168210 | -8.594532 | 0.730993 | 0.059157 | 0.246335 | 0.025553 | 0.188248 | 0.596081 | 121.052062 | 2.365007e+05 | 3.950355 | 39.439935 | 10.484385 | 1.0 | 1928.683287 | 1993.964032 |
| std | 0.145936 | 0.201670 | 3.545462 | 3.619409 | 0.443468 | 0.060647 | 0.257790 | 0.109918 | 0.160314 | 0.237670 | 27.367421 | 6.380371e+04 | 0.266636 | 17.578066 | 2.867590 | 0.0 | 349.675196 | 15.857415 |
| min | 0.000000 | 0.020400 | 0.000000 | -27.010000 | 0.000000 | 0.000000 | 0.000002 | 0.000000 | 0.013600 | 0.000000 | 0.000000 | 5.938700e+04 | 0.000000 | 13.307160 | 3.000000 | 1.0 | 0.000000 | 1970.000000 |
| 25% | 0.497000 | 0.498000 | 2.000000 | -10.943750 | 0.000000 | 0.031300 | 0.033825 | 0.000000 | 0.087400 | 0.407000 | 100.312000 | 1.975065e+05 | 4.000000 | 27.657350 | 9.000000 | 1.0 | 1979.000000 | 1980.000000 |
| 50% | 0.598000 | 0.663000 | 5.000000 | -8.086000 | 1.000000 | 0.038500 | 0.144000 | 0.000011 | 0.126000 | 0.616000 | 119.619000 | 2.304335e+05 | 4.000000 | 35.593610 | 10.000000 | 1.0 | 1992.000000 | 1990.000000 |
| 75% | 0.692000 | 0.805000 | 8.000000 | -5.775250 | 1.000000 | 0.056900 | 0.402000 | 0.000745 | 0.244000 | 0.794000 | 136.232000 | 2.662008e+05 | 4.000000 | 46.327618 | 12.000000 | 1.0 | 2004.000000 | 2010.000000 |
| max | 0.988000 | 0.996000 | 11.000000 | -0.716000 | 1.000000 | 0.941000 | 0.994000 | 0.982000 | 0.999000 | 0.985000 | 217.396000 | 1.367093e+06 | 5.000000 | 262.615400 | 55.000000 | 1.0 | 2020.000000 | 2020.000000 |
# Reshape the selected audio features into long format: one row per
# (song, feature) pair, with the row label, target and decade kept as
# identifier columns.
id_columns = ["index", "target", "decade"]
feature_columns = ["energy", "danceability", "acousticness", "speechiness", "liveness"]

# Expose the row label as a regular column so it can serve as a melt identifier.
initial_dataset["index"] = initial_dataset.index
factors = initial_dataset[id_columns + feature_columns].melt(id_vars=id_columns)
factors.columns
Index(['index', 'target', 'decade', 'variable', 'value'], dtype='object')
For the first visual I explore how the song features are distributed in each decade; the data was filtered above to hit songs only (`target == 1`).
I found that the most appropriate chart type is the box plot, as it conveys both the statistical summary and the visual shape of each feature's distribution.
Advantages: it makes it easy to describe the distribution of the chosen variables. Disadvantages: limited interactivity, since selections cannot be applied. In my case that was not a major problem, as there weren't many years to compare.
# Box plots of the melted feature values: one panel (column facet) per
# feature, one box per decade, whiskers spanning the full min-max range.
decade_axis = alt.X("decade:O", title="Years")
value_axis = alt.Y("value:Q", title="Value")
count_color = alt.Color("count(value)", scale=alt.Scale(scheme="purples"), legend=None)
feature_facet = alt.Column("variable:N", title="Feature")

feature_boxplot = alt.Chart(factors).mark_boxplot(extent="min-max").encode(
    x=decade_axis,
    y=value_axis,
    color=count_color,
    column=feature_facet,
)
# Last expression of the cell, so the notebook renders the chart.
feature_boxplot.properties(
    width=220,
    height=220,
    title="Feature values distribution for hit songs",
)
The second insight is an exploration of how the top hits are distributed across the available countries from the 1960s to the 2010s.
Here I'm using a map with a slider that allows us to choose the decade of interest.
# Slider-bound selection on the "decade" field: values run from 1970 to 2020
# in steps of 10, starting at 1970. The charts below filter on it.
slider = alt.binding_range(min=1970, max=2020, step=10)
selector = alt.selection_single(
    name="decade",
    fields=["decade"],
    bind=slider,
    init=dict(decade=1970),
)
# Bar chart of how the songs of the selected decade are split across musical
# keys. After the decade filter every row gets the weight
# 1 / (number of remaining rows), so the per-key sum of weights is that key's
# share of the decade's songs.
keys = (
    alt.Chart(initial_dataset)
    .add_selection(selector)
    .transform_filter(selector)
    .transform_joinaggregate(song_counter='count()')
    .transform_calculate(songs_percentage="1 / datum.song_counter")
    .mark_bar()
    .encode(
        x=alt.X("key:O", title="ID of the key", axis=alt.Axis(labelAngle=360)),
        # The axis plots a share formatted as a percentage, so it is titled as
        # one; the absolute count stays available in the tooltip.
        y=alt.Y('sum(songs_percentage):Q', title="Percentage of the songs", axis=alt.Axis(format="%")),
        color=alt.Color("sum(songs_percentage):Q", scale=alt.Scale(scheme='purples'), legend=None),
        tooltip=[
            alt.Tooltip("key", title="ID of the key"),
            alt.Tooltip('count()', title='No. of songs'),
            alt.Tooltip("sum(songs_percentage):Q", title="Percentage of the songs", format=".1%"),
        ],
    )
    .properties(width=600, height=400, title="Most common song keys through the years, %")
)
# Bar chart of how the songs of the selected decade are split across styles,
# with bars sorted by descending count. After the decade filter every row gets
# the weight 1 / (number of remaining rows), so the per-style sum of weights is
# that style's share of the decade's songs.
styles = (
    alt.Chart(initial_dataset)
    .add_selection(selector)
    .transform_filter(selector)
    .transform_joinaggregate(song_counter2='count()')
    .transform_calculate(songs_percentages="1 / datum.song_counter2")
    .mark_bar()
    .encode(
        x=alt.X(
            "style:O",
            title="\nStyle",
            axis=alt.Axis(labelAngle=360),
            sort=alt.EncodingSortField(field="style", op="count", order='descending'),
        ),
        # The axis plots a share formatted as a percentage, so it is titled as
        # one; the absolute count stays available in the tooltip.
        y=alt.Y('sum(songs_percentages):Q', title="Percentage of the songs", axis=alt.Axis(format="%")),
        color=alt.Color("sum(songs_percentages):Q", scale=alt.Scale(scheme='purples'), legend=None),
        tooltip=[
            alt.Tooltip("style", title="Style of the song"),
            alt.Tooltip("count()", title="No. of songs"),
            alt.Tooltip("sum(songs_percentages):Q", title="Percentage of the songs", format=".1%"),
        ],
    )
    .properties(width=600, height=400, title="Most common song styles through the years, %")
)
# Place the two bar charts side by side with independent colour scales,
# centre the slider widget with a small CSS rule, then render the result.
side_by_side = alt.hconcat(keys, styles, center=True)
side_by_side = side_by_side.configure_concat(spacing=10)
concc = side_by_side.resolve_scale(color='independent')

slider_css = """
<style>
.vega-bind {
text-align:center;
}
</style>
"""
display(HTML(slider_css))
display(concc)
# Load the country geometries and list the distinct country identifiers they carry.
world = gpd.read_file("world-countries.json")
world["id"].unique()
array(['AFG', 'AGO', 'ALB', 'ARE', 'ARG', 'ARM', 'ATA', 'ATF', 'AUS',
'AUT', 'AZE', 'BDI', 'BEL', 'BEN', 'BFA', 'BGD', 'BGR', 'BHS',
'BIH', 'BLR', 'BLZ', 'BOL', 'BRA', 'BRN', 'BTN', 'BWA', 'CAF',
'CAN', 'CHE', 'CHL', 'CHN', 'CIV', 'CMR', 'COD', 'COG', 'COL',
'CRI', 'CUB', '-99', 'CYP', 'CZE', 'DEU', 'DJI', 'DNK', 'DOM',
'DZA', 'ECU', 'EGY', 'ERI', 'ESP', 'EST', 'ETH', 'FIN', 'FJI',
'FLK', 'FRA', 'GAB', 'GBR', 'GEO', 'GHA', 'GIN', 'GMB', 'GNB',
'GNQ', 'GRC', 'GRL', 'GTM', 'GUY', 'HND', 'HRV', 'HTI', 'HUN',
'IDN', 'IND', 'IRL', 'IRN', 'IRQ', 'ISL', 'ISR', 'ITA', 'JAM',
'JOR', 'JPN', 'KAZ', 'KEN', 'KGZ', 'KHM', 'KOR', 'KWT', 'LAO',
'LBN', 'LBR', 'LBY', 'LKA', 'LSO', 'LTU', 'LUX', 'LVA', 'MAR',
'MDA', 'MDG', 'MEX', 'MKD', 'MLI', 'MMR', 'MNE', 'MNG', 'MOZ',
'MRT', 'MWI', 'MYS', 'NAM', 'NCL', 'NER', 'NGA', 'NIC', 'NLD',
'NOR', 'NPL', 'NZL', 'OMN', 'PAK', 'PAN', 'PER', 'PHL', 'PNG',
'POL', 'PRI', 'PRK', 'PRT', 'PRY', 'QAT', 'ROU', 'RUS', 'RWA',
'SAU', 'SDN', 'SDS', 'SEN', 'SLB', 'SLE', 'SLV', 'SOM', 'SRB',
'SUR', 'SVK', 'SVN', 'SWE', 'SWZ', 'SYR', 'TCD', 'TGO', 'THA',
'TJK', 'TKM', 'TLS', 'TTO', 'TUN', 'TUR', 'TWN', 'TZA', 'UGA',
'UKR', 'URY', 'USA', 'UZB', 'VEN', 'VNM', 'VUT', 'PSE', 'YEM',
'ZAF', 'ZMB', 'ZWE'], dtype=object)
# Translate each song's country code into a three-letter alpha-3 code stored in
# the "location" column, matching the three-letter ids used by the world map.
# Each distinct code is resolved once and then mapped onto the whole column,
# rather than running a fuzzy search and a .loc write for every row.
# pycountry's search_fuzzy raises LookupError when nothing matches; that error
# is deliberately left to propagate so unknown codes are not silently dropped.
alpha3_by_code = {
    code: pycountry.countries.search_fuzzy(code)[0].alpha_3
    for code in initial_dataset["country_code"].dropna().unique()
}
initial_dataset["location"] = initial_dataset["country_code"].map(alpha3_by_code)
# Outer-join the country geometries with the songs on the three-letter country
# code, keeping countries without songs as well as songs without a geometry.
merged = pd.merge(
    left=world,
    right=initial_dataset,
    left_on="id",
    right_on="location",
    how="outer",
)
merged
| id | name | geometry | track | artist | uri | danceability | energy | key | loudness | ... | sections | target | country | country_code | release_date | style | genre | decade | index | location | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Afghanistan | POLYGON ((61.21082 35.65007, 62.23065 35.27066... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | AGO | Angola | MULTIPOLYGON (((16.32653 -5.87747, 16.57318 -6... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | ALB | Albania | POLYGON ((20.59025 41.85540, 20.46317 41.51509... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | ARE | United Arab Emirates | POLYGON ((51.57952 24.24550, 51.75744 24.29407... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | ARG | Argentina | MULTIPOLYGON (((-65.50000 -55.20000, -66.45000... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9425 | NaN | NaN | None | Pour It Up | Rihanna | spotify:track:76VtA90NKurNqaQpV126Ue | 0.785 | 0.478 | 2.0 | -8.735 | ... | 8.0 | 1.0 | Saint Michael, Barbados | BB | 2012.0 | Rock/Pop | Pop | 2020.0 | 40235.0 | BRB |
| 9426 | NaN | NaN | None | Rude Boy | Rihanna | spotify:track:60jzFy6Nn4M0iD1d94oteF | 0.563 | 0.750 | 11.0 | -4.496 | ... | 9.0 | 1.0 | Saint Michael, Barbados | BB | 2009.0 | Rock/Pop | Pop | 2020.0 | 40397.0 | BRB |
| 9427 | NaN | NaN | None | Impossible | Shontelle | spotify:track:45EDI3rk0f4cAMt9f8b56R | 0.599 | 0.624 | 8.0 | -3.631 | ... | 11.0 | 1.0 | Saint James, Barbados | BB | 2010.0 | Rock/Pop | R&B | 2020.0 | 40481.0 | BRB |
| 9428 | NaN | NaN | None | Love On The Brain | Rihanna | spotify:track:5oO3drDxtziYU2H1X23ZIp | 0.509 | 0.637 | 4.0 | -4.830 | ... | 11.0 | 1.0 | Saint Michael, Barbados | BB | 2016.0 | Rock/Pop | Pop | 2020.0 | 40507.0 | BRB |
| 9429 | NaN | NaN | None | What Now | Rihanna | spotify:track:0aUWfpD3PlSv3FTTKcT2rN | 0.402 | 0.696 | 8.0 | -4.799 | ... | 11.0 | 1.0 | Saint Michael, Barbados | BB | 2012.0 | Rock/Pop | Pop | 2020.0 | 41097.0 | BRB |
9430 rows × 30 columns
# Count the tracks per country name using only fully populated rows, then
# attach that count to every row of `merged`. Both frames have a "track"
# column, so pandas suffixes them as track_x (title) and track_y (count).
complete_rows = merged.dropna().reset_index(drop=True)
per_country = complete_rows.groupby(["name"]).count()
filtered = per_country["track"].reset_index()
filtered_ = pd.merge(merged, filtered, how="left", on="name")
filtered_
| id | name | geometry | track_x | artist | uri | danceability | energy | key | loudness | ... | target | country | country_code | release_date | style | genre | decade | index | location | track_y | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | AFG | Afghanistan | POLYGON ((61.21082 35.65007, 62.23065 35.27066... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | AGO | Angola | MULTIPOLYGON (((16.32653 -5.87747, 16.57318 -6... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 2 | ALB | Albania | POLYGON ((20.59025 41.85540, 20.46317 41.51509... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | ARE | United Arab Emirates | POLYGON ((51.57952 24.24550, 51.75744 24.29407... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | ARG | Argentina | MULTIPOLYGON (((-65.50000 -55.20000, -66.45000... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 9425 | NaN | NaN | None | Pour It Up | Rihanna | spotify:track:76VtA90NKurNqaQpV126Ue | 0.785 | 0.478 | 2.0 | -8.735 | ... | 1.0 | Saint Michael, Barbados | BB | 2012.0 | Rock/Pop | Pop | 2020.0 | 40235.0 | BRB | NaN |
| 9426 | NaN | NaN | None | Rude Boy | Rihanna | spotify:track:60jzFy6Nn4M0iD1d94oteF | 0.563 | 0.750 | 11.0 | -4.496 | ... | 1.0 | Saint Michael, Barbados | BB | 2009.0 | Rock/Pop | Pop | 2020.0 | 40397.0 | BRB | NaN |
| 9427 | NaN | NaN | None | Impossible | Shontelle | spotify:track:45EDI3rk0f4cAMt9f8b56R | 0.599 | 0.624 | 8.0 | -3.631 | ... | 1.0 | Saint James, Barbados | BB | 2010.0 | Rock/Pop | R&B | 2020.0 | 40481.0 | BRB | NaN |
| 9428 | NaN | NaN | None | Love On The Brain | Rihanna | spotify:track:5oO3drDxtziYU2H1X23ZIp | 0.509 | 0.637 | 4.0 | -4.830 | ... | 1.0 | Saint Michael, Barbados | BB | 2016.0 | Rock/Pop | Pop | 2020.0 | 40507.0 | BRB | NaN |
| 9429 | NaN | NaN | None | What Now | Rihanna | spotify:track:0aUWfpD3PlSv3FTTKcT2rN | 0.402 | 0.696 | 8.0 | -4.799 | ... | 1.0 | Saint Michael, Barbados | BB | 2012.0 | Rock/Pop | Pop | 2020.0 | 41097.0 | BRB | NaN |
9430 rows × 31 columns